Import data

library(Matrix)
library(fdasrvf)
library(lme4)
library(pedigreemm)
library(ggplot2)
library(plotly)

setwd("D:/KCL_2023-2027_PhD/Year 1/Genetics/FMEMs-quantitative-genetics/R_code")
TRFUN25PUP4 = read.delim("TRFUN25PUP4.DAT",header = FALSE)
names(TRFUN25PUP4)<-c("id","sire","dam","trait","x")
df <- data.frame(TRFUN25PUP4)

FirstUniqueIdPos <- which(duplicated(df$id) == FALSE)
N = length(FirstUniqueIdPos) # N = 873 subjects
n = length(df$id) # n = 6860 observations
age_list <- split(df$x,df$id)
trait_list <- split(df$trait,df$id)

## Rescale time interval to [0,1]
## x = (x -min(x))/(max(x) - min(x))
age_list_new <- list()
for (i in 1:N){
  age_list_new[[i]] = (age_list[[i]]-min(age_list[[i]]))/(max(age_list[[i]])-min(age_list[[i]]))
}

df$x_rescaled <- unsplit(age_list_new,df$id)

Data smoothing

Here we smooth growth curves on the logarithmic scale and then take exponential to recover body mass. This imposes positive smoothing. Smoothing parameter \(\lambda_i\) for each curve is selected by GCV and we restrict \(\lambda \le 10^{-4}\).

agefine <- seq(0,1,length=100) # dense time grid
logmass <- matrix(0,100,N) # store smooth log growth curves
pred_mass <- matrix(0,100,N) # store the smoothed mass recovered by taking exp
lam <- rep(0,N) # smoothing parameter used for each log growth curve

for (i in 1:N){
  ss_logmass <- smooth.spline(age_list_new[[i]], log(trait_list[[i]]), cv=FALSE,
                              all.knots=TRUE)
  logmass[,i] <- predict(ss_logmass, agefine)$y
  pred_mass[,i] <- exp(predict(ss_logmass,agefine)$y)
  lam[i] <- ss_logmass$lambda
}

large_lam <- which(lam > 1e-4)
mass_adjusted <- matrix(0,100, length(large_lam))
for ( i in 1: length(large_lam)){
  ss_new <- smooth.spline(age_list_new[[large_lam[i]]], log(trait_list[[large_lam[i]]]), 
                          all.knots = TRUE, lambda = 1e-4) # restrict lambda <=1e-4
  mass_adjusted[,i] <- exp(predict(ss_new, agefine)$y)
}
colnames(mass_adjusted) <- as.character(large_lam)

## Let reform the smoothed growth curves into a matrix
trait_hat <- pred_mass 
for (i in 1: length(large_lam)){
  trait_hat[,large_lam[i]] <- mass_adjusted[,i]
}

Curve Alignment

Align the growth curves using the r package \(\textbf{fdasrvf}\) and extract the mean from the aligned curves.

aligned_mass_process <- time_warping(f=trait_hat, time=agefine)
aligned_mass_curve <- aligned_mass_process$fn
aligned_mean <- aligned_mass_process$fmean
warping_funs <- aligned_mass_process$warping_functions

Functional Principal Component Analysis

We run FPCA on the aligned data and the first three principal components will be used as the basis functions for our model.

fpcaobj_mass <- prcomp(x=t(aligned_mass_curve), retx = TRUE, center = TRUE, rank. = 3)
eigen_mass <- fpcaobj_mass$rotation # eigen vectors

##              [,1]
## [1,] 2.341877e-16
##      [,1]
## [1,]    0
##              [,1]
## [1,] -5.20417e-17

Fit Functional Mixed-Effect Model

We calculate the additive genetic relationship matrix \(\bf{A}\) and fit both fixed and random effects using the same set of functional basis.

pos = df$id[FirstUniqueIdPos] # extract ids for all subjects
sire_id = df$sire[FirstUniqueIdPos] # extract ids for sire
dam_id = df$dam[FirstUniqueIdPos] # extract ids for dam

pede <- editPed(sire = sire_id, dam = dam_id, label = pos)
ped<- with(pede, pedigree(label=label, sire=sire, dam=dam))
A <- getA(ped)[163:1035,163:1035]

Here we compare three different ways to fit data to our genetic model:

  1. Fit raw data:

We need to interpolate the discrete eigen vectors to eigen functions and evaluate eigen functions at the original time points. This will be done using the function convert_to_basisfunctions.

### define the basis functions (eigenfunctions)
phi_list <- list() 
# create an empty list which stores eigenfunctions for 873 subjects
# evaluated at the original time points.

for (i in 1:N){
  phi <- convert_to_basisfunctions(t = agefine, eigenvecs = eigen_mass[,1:3],
                                   tout = age_list_new[[i]])
  phi_list[[i]] <- phi
}

phi <- do.call(rbind,phi_list)
colnames(phi) <- c("phi1", "phi2", "phi3")

### update the dataframe to include basis functions
df1 <- cbind(df,phi)

### use a fixed regression of the same form of random regression 
fmmForm1 <- trait ~ df1$phi1 + df1$phi2 + df1$phi3 + (-1 + df1$phi1 + df1$phi2 + df1$phi3 | df1$id) + 
  (-1 + df1$phi1 + df1$phi2 + df1$phi3 | df1$id) 

system.time(
  ff1 <- fit_genetic_fmm(fmmForm1, df1, A, eigen_mass)
) # user   system  elapsed
##   用户   系统   流逝 
## 106.60   4.55 195.12
summary(ff1)
## Linear mixed model fit by REML ['lmerMod']
## 
## REML criterion at convergence: 60950.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.0919 -0.4668 -0.0205  0.4220  5.9236 
## 
## Random effects:
##  Groups   Name     Variance Std.Dev. Corr       
##  df1.id   df1$phi1 20592.53 143.501             
##           df1$phi2  6157.40  78.469  -0.59      
##           df1$phi3   723.83  26.904   0.49 -0.99
##  df1.id.1 df1$phi1 18316.43 135.338             
##           df1$phi2   285.54  16.898  -0.66      
##           df1$phi3    81.57   9.031   0.74 -0.99
##  Residual            272.98  16.522             
## Number of obs: 6860, groups:  df1$id, 873
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  -42.764      6.406  -6.676
## df1$phi1    1923.638     50.841  37.836
## df1$phi2    -274.299     42.780  -6.412
## df1$phi3     -38.079      5.636  -6.756
## 
## Correlation of Fixed Effects:
##          (Intr) df1$p1 df1$p2
## df1$phi1 -0.948              
## df1$phi2  0.979 -0.964       
## df1$phi3  0.825 -0.700  0.710

Extract the covariance matrices and convert them to covariance functions.

VC1 <- VarCorr(ff1)
CG1 <- VC1[["df1.id"]]
CE1 <- VC1[["df1.id.1"]]

### Convert to genetic covariance function
CG_fun1 <- eigen_mass%*% CG1 %*% t(eigen_mass)
### environmental covariance function
CE_fun1 <- eigen_mass %*% CE1 %*% t(eigen_mass)
### Phenotypic covariance function
P_fun1 <- CG_fun1 + CE_fun1
  1. Fit the smoothed data on the original time points.

We interpolate the smoothed data at the original measurement points.

trait_hat_list <- list()

for (i in 1:N){
  inter_trait <- convert_to_basisfunctions(t = agefine, eigenvecs = trait_hat[,i],
                                   tout = age_list_new[[i]])
  trait_hat_list[[i]] <- inter_trait
}

df2 <- data.frame(id = df$id, trait = unsplit(trait_hat_list, df$id), phi)

fmmForm2 <- trait ~ df2$phi1 + df2$phi2 + df2$phi3 + (-1 + df2$phi1 + df2$phi2 + df2$phi3 | df2$id) + 
  (-1 + df2$phi1 + df2$phi2 + df2$phi3 | df2$id) 

system.time(
  ff2 <- fit_genetic_fmm(fmmForm2, df2, A, eigen_mass)
) # user   system  elapsed
##   用户   系统   流逝 
##  61.21   1.97 113.68
summary(ff2)
## Linear mixed model fit by REML ['lmerMod']
## 
## REML criterion at convergence: 60361.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.1137 -0.4699 -0.0377  0.4124  6.1927 
## 
## Random effects:
##  Groups   Name     Variance Std.Dev. Corr       
##  df2.id   df2$phi1 20420.4  142.90              
##           df2$phi2  5902.0   76.82   -0.59      
##           df2$phi3   743.5   27.27    0.48 -0.99
##  df2.id.1 df2$phi1 18791.0  137.08              
##           df2$phi2   758.5   27.54   -0.47      
##           df2$phi3   223.2   14.94    0.50 -1.00
##  Residual            241.6   15.54              
## Number of obs: 6860, groups:  df2$id, 873
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  -37.514      6.032  -6.219
## df2$phi1    1882.149     48.164  39.078
## df2$phi2    -240.549     40.355  -5.961
## df2$phi3     -36.529      5.433  -6.723
## 
## Correlation of Fixed Effects:
##          (Intr) df2$p1 df2$p2
## df2$phi1 -0.943              
## df2$phi2  0.977 -0.960       
## df2$phi3  0.806 -0.672  0.679
  1. Fit the smoothed data on the dense time grid.
### Reform the aligned data to a dataframe for model fitting
subjectID <- rep(unique(df$id), each=100)
trait_pred <- c(aligned_mass_curve)
basis1 <- rep(eigen_mass[,1], times = 873)
basis2 <- rep(eigen_mass[,2], times = 873)
basis3 <- rep(eigen_mass[,3], times = 873)
new_df <- data.frame(subjectID, trait_pred, basis1, basis2, basis3)
names(new_df) <- c("subjectID", "trait_hat", "basis1", "basis2", "basis3")

fmeFormula <- trait_hat ~ new_df$basis1 + new_df$basis2 + new_df$basis3 + 
  (-1 + new_df$basis1 + new_df$basis2 + new_df$basis3 | new_df$subjectID) + 
  (-1 + new_df$basis1 + new_df$basis2 + new_df$basis3 | new_df$subjectID)

system.time(
  fit_sameBasis <- fit_genetic_fmm(formula= fmeFormula, data=new_df, A = A, phi = eigen_mass)
) 
##    用户    系统    流逝 
## 1179.76   68.99 1894.39
 # user   system   elapsed

summary(fit_sameBasis)
## Linear mixed model fit by REML ['lmerMod']
## 
## REML criterion at convergence: 292502.9
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -15.0463  -0.4291  -0.0128   0.4343  20.6870 
## 
## Random effects:
##  Groups             Name          Variance  Std.Dev. Corr       
##  new_df.subjectID   new_df$basis1 14698.909 121.239             
##                     new_df$basis2   761.155  27.589  -0.23      
##                     new_df$basis3    76.205   8.730  -0.37  0.40
##  new_df.subjectID.1 new_df$basis1 18469.901 135.904             
##                     new_df$basis2    71.553   8.459   0.61      
##                     new_df$basis3   219.271  14.808   0.19 -0.66
##  Residual                             1.349   1.161             
## Number of obs: 87300, groups:  new_df$subjectID, 873
## 
## Fixed effects:
##                Estimate Std. Error t value
## (Intercept)     12.6225     0.1382  91.311
## new_df$basis1 1513.5783    13.8295 109.445
## new_df$basis2   92.0683     3.0843  29.851
## new_df$basis3    7.8831     1.0701   7.366
## 
## Correlation of Fixed Effects:
##             (Intr) nw_d$1 nw_d$2
## new_df$bss1 -0.075              
## new_df$bss2  0.292 -0.207       
## new_df$bss3  0.086 -0.272  0.324